The dataset on police incidents in the City of Seattle includes a collection of events related to 911 emergency calls recorded from 2010 to the present day. The dataset contains approximately 6 million records and has a total size of 5.6GB. It is an open dataset obtained from the official website of the City of Seattle and is available from the City of Seattle open data portal (data.seattle.gov).
This dataset contains information about calls made to the police and the corresponding police responses in Seattle. The columns in the dataset are as follows:
The dataset provides detailed information on all calls received by the police, including call types, priorities, call and response times, as well as the geographic locations of the incidents. It also includes the initial and final classification of each call, enabling analysis of how incidents are categorized and resolved.
Given the size of the dataset (5.6 GB), the data is loaded and further transformed using Apache Spark tools.
To ensure the correct data types are used, a schema is specified according to which Spark loads the data frames.
# Explicit column types so Spark does not have to scan the 5.6 GB file to
# infer a schema. The timestamp columns are read as character and parsed
# into Spark timestamps later with to_timestamp().
police_schema <- c(
  CAD_Event_Number = "character",
  Event_Clearance_Description = "character",
  Call_Type = "character",
  Priority = "character",
  Initial_Call_Type = "character",
  Final_Call_Type = "character",
  Original_Time_Queued = "character",
  Arrived_Time = "character",
  Precinct = "character",
  Sector = "character",
  Beat = "character",
  Blurred_Longitude = "numeric",
  Blurred_Latitude = "numeric"
)
# Register the CSV as the Spark table "police_calls" using the schema above.
df <- spark_read_csv(
  sc,
  name = "police_calls",
  path = "Call_Data.csv",
  header = TRUE,
  columns = police_schema
)
# Drop every row containing a missing value (executed Spark-side).
df_clean <- df %>% na.omit()
## * Dropped 3180987 rows with 'na.omit' (10491323 => 7310336)
# Parse the two character timestamp columns into Spark timestamps.
# NOTE(review): the two columns use different source formats
# ("MM/dd/yyyy hh:mm:ss a" vs "yyyy MMM dd hh:mm:ss a") -- presumably this
# mirrors the raw CSV; confirm, since a wrong format silently yields NA.
df_clean <- df_clean %>%
mutate(
Original_Time_Queued = to_timestamp(Original_Time_Queued, "MM/dd/yyyy hh:mm:ss a"),
Arrived_Time = to_timestamp(Arrived_Time, "yyyy MMM dd hh:mm:ss a")
)
# Keep only rows where both timestamps parsed and the arrival strictly
# follows the time the call was queued (non-positive response times dropped).
df_clean <- df_clean %>%
filter(!is.na(Original_Time_Queued) & !is.na(Arrived_Time) & Original_Time_Queued < Arrived_Time)
# FIX: the path was wrapped in typographic ("smart") quotes, which is a parse
# error in R; replaced with straight ASCII quotes.
# NOTE(review): rendering a document in the middle of the analysis script is
# unusual -- confirm this call belongs here.
rmarkdown::render("Documentation.Rmd")
# Sanity-check the parsed time range; the aggregate is a single row, so
# collect() is cheap. In Spark SQL the aggregates skip NULLs regardless of
# na.rm.
# NOTE(review): the minimum queued time (2009) predates the "2010 to present"
# claim in the dataset description -- confirm against the source data.
df_clean %>% summarise(
min_original_time = min(Original_Time_Queued, na.rm = TRUE),
max_original_time = max(Original_Time_Queued, na.rm = TRUE),
min_arrived_time = min(Arrived_Time, na.rm = TRUE),
max_arrived_time = max(Arrived_Time, na.rm = TRUE)
) %>% collect()
## # A tibble: 1 × 4
##   min_original_time   max_original_time   min_arrived_time
##   <dttm>              <dttm>              <dttm>
## 1 2009-06-02 03:43:08 2025-10-07 23:58:31 2009-06-02 04:01:55
## # ℹ 1 more variable: max_arrived_time <dttm>
# Count events per police precinct on the Spark side, then pull the small
# aggregate into R for plotting.
precinct_counts <- df_clean %>%
  count(Precinct, name = "Count") %>%
  collect()
# Bar chart of call volume per precinct.
ggplot(precinct_counts, aes(x = Precinct, y = Count, fill = Precinct)) +
  geom_col() +
  labs(
    title = "Distribution by Police Precincts",
    x = "Police Precinct",
    y = "Number of Occurrences",
    fill = "Precinct"
  )
# Free the local copy of the aggregate.
rm(precinct_counts)
gc()
##            used (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells 1836210 98.1    3611037 192.9  2405122 128.5
## Vcells 3298312 25.2    8388608  64.0  4601928  35.2
# Aggregate resolution types in Spark, collect, then keep only common ones
# (>= 5000 occurrences) so the chart stays readable.
clearance_counts <- df_clean %>%
  count(Event_Clearance_Description, name = "Count") %>%
  collect() %>%
  filter(Count >= 5000)
ggplot(clearance_counts, aes(x = Event_Clearance_Description, y = Count, fill = Event_Clearance_Description)) +
  geom_col() +
  labs(
    title = "Distribution by Case Resolution Type",
    x = "Resolution",
    y = "Number of Occurrences",
    fill = "Resolution Type"
  ) +
  # Long resolution names: hide the tick labels and rely on the legend.
  theme(axis.text.x = element_blank())
rm(clearance_counts)
gc()
##            used (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells 1865752 99.7    3611037 192.9  3611037 192.9
## Vcells 3367417 25.7    8388608  64.0  4601928  35.2
# Priority counts remain a lazy Spark table; ggplot collects it implicitly
# when the plot is built (matches the original behaviour).
priority_counts <- df_clean %>%
  count(Priority, name = "Count")
ggplot(priority_counts, aes(x = Priority, y = Count, fill = Priority)) +
  geom_col() +
  labs(
    title = "Distribution by Priority",
    x = "Priority",
    y = "Number of Occurrences",
    fill = "Priority"
  )
rm(priority_counts)
gc()
##            used (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells 1860523 99.4    3611037 192.9  3611037 192.9
## Vcells 3354959 25.6    8388608  64.0  4601928  35.2
# Count call methods in Spark and keep only frequent ones (> 5000). Both
# steps are lazy; ggplot materialises the small result.
call_type_counts <- df_clean %>%
  count(Call_Type, name = "Count") %>%
  filter(Count > 5000)
ggplot(call_type_counts, aes(x = Call_Type, y = Count, fill = Call_Type)) +
  geom_col() +
  labs(
    title = "Distribution by Call Method",
    x = "Call Method",
    y = "Number of Occurrences",
    fill = "Call Method"
  ) +
  # Long labels: hide tick text and rely on the legend.
  theme(axis.text.x = element_blank())
rm(call_type_counts)
gc()
##            used (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells 1857752 99.3    3611037 192.9  3611037 192.9
## Vcells 3347804 25.6    8388608  64.0  4601928  35.2
# Collect a 50% sample of the blurred coordinates for local plotting.
location_sample <- df_clean %>%
  select(Blurred_Longitude, Blurred_Latitude) %>%
  sdf_sample(fraction = 0.5, replacement = FALSE) %>%
  collect()
# FIX: this histogram plots Blurred_Longitude but was titled and labelled
# "Latitude"; title/axis corrected to Longitude.
ggplot(data = location_sample, aes(x = Blurred_Longitude)) +
  geom_histogram(bins = 30, fill = "skyblue", color = "black") +
  labs(title = "Distribution of Geographic Longitude",
       x = "Geographic Longitude",
       y = "Frequency")
rm(location_sample)
gc()
##            used (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells 1863942 99.6    3611037 192.9  3611037 192.9
## Vcells 9134456 69.7   35131308 268.1 43666942 333.2
# Collect a fresh 50% sample of the blurred coordinates for local plotting.
location_sample <- df_clean %>%
  select(Blurred_Longitude, Blurred_Latitude) %>%
  sdf_sample(fraction = 0.5, replacement = FALSE) %>%
  collect()
# FIX: this histogram plots Blurred_Latitude but was titled and labelled
# "Longitude"; title/axis corrected to Latitude.
ggplot(data = location_sample, aes(x = Blurred_Latitude)) +
  geom_histogram(bins = 30, fill = "skyblue", color = "black") +
  labs(title = "Distribution of Geographic Latitude",
       x = "Geographic Latitude",
       y = "Frequency")
rm(location_sample)
gc()
##            used (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells 1864156 99.6    3611037 192.9  3611037 192.9
## Vcells 9133472 69.7   35138058 268.1 43753357 333.9
# Restrict coordinates to a plausible Seattle bounding box (lat 47..48,
# lon -123..-122) to drop out-of-range / placeholder locations.
df_clean_ll <- df_clean %>% filter(Blurred_Latitude >= 47 & Blurred_Latitude <= 48 & Blurred_Longitude >= -123 & Blurred_Longitude <= -122)
# Sample 50% of cleaned location data for the histogram (fraction = 0.5;
# the earlier comment saying "10%" did not match the code)
location_clean_sample <- df_clean_ll %>%
select(Blurred_Longitude, Blurred_Latitude) %>%
sdf_sample(fraction = 0.5, replacement = FALSE) %>%
collect()
ggplot(data = location_clean_sample, aes(x = Blurred_Longitude)) +
geom_histogram(bins = 30, fill = "skyblue", color = "black") +
labs(title = "Distribution of Cleaned Longitude",
x = "Longitude",
y = "Frequency")
rm(location_clean_sample)
gc()
##            used (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells 1864613 99.6    3611037 192.9  3611037 192.9
## Vcells 8885911 67.8   34715674 264.9 43753357 333.9
# Sample 50% of cleaned location data for the histogram (fraction = 0.5;
# the earlier comment saying "10%" did not match the code)
location_clean_sample <- df_clean_ll %>%
select(Blurred_Longitude, Blurred_Latitude) %>%
sdf_sample(fraction = 0.5, replacement = FALSE) %>%
collect()
ggplot(data = location_clean_sample, aes(x = Blurred_Latitude)) +
geom_histogram(bins = 30, fill = "skyblue", color = "black") +
labs(title = "Distribution of Cleaned Latitude",
x = "Latitude",
y = "Frequency")
rm(location_clean_sample)
gc()
##            used (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells 1864702 99.6    3611037 192.9  3611037 192.9
## Vcells 8884790 67.8   33758969 257.6 43753357 333.9
# Setting the API key value
# SECURITY NOTE(review): a live-looking Stadia Maps API key is hard-coded and
# persisted to disk (write = TRUE). It should be rotated and read from an
# environment variable rather than committed in source.
register_stadiamaps("04c1350e-f51f-4265-b458-ad6b6a3192bb", write = TRUE)
# Creating a map of Seattle (bounding box: left/bottom/right/top)
seattle <- c(left = -122.45, bottom = 47.48, right = -122.2, top = 47.73)
# NOTE(review): zoom = 18 is street-level detail; for a city-wide bounding box
# this requests a very large number of tiles -- confirm this is intended.
seattle_map <- get_stadiamap(seattle, zoom = 18)
# Plotting the map
# NOTE(review): df_clean_ll is a Spark table; passing it to geom_point() pulls
# the whole table into R -- presumably the saved image below was produced
# offline, which is why include_graphics() is used.
ggmap(seattle_map) +
geom_point(data = df_clean_ll, aes(x = Blurred_Longitude, y = Blurred_Latitude, color = Final_Call_Type)) +
labs(title = "Map of Seattle City",
x = "Longitude",
y = "Latitude",
color = "Final Call Type")
# Embed the pre-rendered map image.
include_graphics("Images/PointsOnTheMap.png")
# Collect a 50% sample locally to inspect the cardinality of the raw call
# type columns before defining the manual keyword categories below.
df_collected <- df_clean_ll %>%
sdf_sample(fraction = 0.5, replacement = FALSE) %>%
collect()
# Number of distinct raw call types (initial vs final classification).
print(length(unique(df_collected$Initial_Call_Type)))
## [1] 273
print(length(unique(df_collected$Final_Call_Type)))
## [1] 245
# Collapse the ~273 raw Initial_Call_Type values into a small set of manual
# categories by keyword matching. Branch order matters: the first match wins.
# FIX: the traffic pattern previously ended with a trailing "|", creating an
# empty alternative that matches EVERY string -- all rows not captured by an
# earlier branch were mislabeled "Traffic Incident", making the Missing
# Persons / Observation / No action / Other branches unreachable. The trailing
# "|" and a duplicated "DUI" alternative are removed.
# NOTE(review): short alternatives such as "OD" and "ACC" also match inside
# longer words, and the dots in "A.W.O.L." are regex wildcards -- confirm the
# keyword list against the raw call types.
df_clean_ll <- df_clean_ll %>%
  mutate(Initial_Category = case_when(
    grepl("ASLT|Assault|ASSAULT|ASSAULTS|HARRASMENT|THREAT|THREATS|WEAPON|GUN|PANHANDLING|HARASSMENT|VIOLENT", Initial_Call_Type) ~ "Assaults and Threats",
    grepl("TRAFFICING|SEX|RAPE|PORNOGRAPHY|PROSTITUTION|LEWD|PROWLER", Initial_Call_Type) ~ "Sex Offenses",
    grepl("NARCOTICS|DRUGS|MARIJUANA|OVERDOSE|OD|LIQUOR|DETOX|INTOX|LIQ", Initial_Call_Type) ~ "Narcotics",
    grepl("HARBOR|ANIMAL|GAMBLING|WATER|TREES|NORAD|STADIUM|ILLEGAL DUMPING|SLEEPER|HAZ|BIAS|NUISANCE|URINATING|HOSPITAL|PHONE|CROWD|EVENT|DEMONSTRATIONS|DISTURBANCE|UNUSUAL|NOISE|POWER|LANDLINE|LITTERING", Initial_Call_Type) ~ "Civil incidents and security",
    grepl("DOA|SHOTS|CASUALTY|FELONY|SUSPICIOUS|ESCAPE|FIRE|PURSUIT|SWAT|SHOOTING|SUICIDE|HOSTAGE|HOMICIDE", Initial_Call_Type) ~ "Emergency and Critical incidents",
    grepl("ROBBERY|BURGLARY|PROPERTY|THEFT|BREAKING|SHOPLIFT|ARSON|TRESPASS|BURG|BURN|EXPLOSION|FRAUD", Initial_Call_Type) ~ "Property Crimes",
    grepl("ALARM|ORDER|INSPECTION|WATCH", Initial_Call_Type) ~ "Alarm and Security",
    grepl("ASSIST|CHECK|HELP|ASSIGNED|PATROL", Initial_Call_Type) ~ "Assistance and Checks",
    grepl("DOMESTIC|ABUSE|CUSTODIAL|ARGUMENTS|DV", Initial_Call_Type) ~ "Domestic Violence",
    grepl("Traffic|VIOLATIONS|ACCIDENT|MVC|CAR|DUI|TRAF|ROAD|VEHICLE|ACC|HIT AND RUN", Initial_Call_Type) ~ "Traffic Incident",
    grepl("MISSING|AWOL|FOUND|RUNAWAY|ABDUCTION|KIDNAP|CHILD|JUVENILE|LOST|AMBER|A.W.O.L.", Initial_Call_Type) ~ "Missing Persons",
    grepl("OBS", Initial_Call_Type) ~ "Observation",
    grepl("CANCELLED|NO ANSWER|OUT AT RANGE", Initial_Call_Type) ~ "No action",
    TRUE ~ "Other"
  ))
# Same keyword categorization applied to the final classification.
# FIX: as with Initial_Category, the traffic pattern ended with a trailing
# "|" (an empty alternative matching every string), so all otherwise-unmatched
# rows were mislabeled "Traffic Incident" and later branches were unreachable.
# The trailing "|" and a duplicated "DUI" alternative are removed.
df_clean_ll <- df_clean_ll %>%
  mutate(Final_Category = case_when(
    grepl("ASLT|Assault|ASSAULT|ASSAULTS|HARRASMENT|THREAT|THREATS|WEAPON|GUN|PANHANDLING|HARASSMENT|VIOLENT", Final_Call_Type) ~ "Assaults and Threats",
    grepl("TRAFFICING|SEX|RAPE|PORNOGRAPHY|PROSTITUTION|LEWD|PROWLER", Final_Call_Type) ~ "Sex Offenses",
    grepl("NARCOTICS|DRUGS|MARIJUANA|OVERDOSE|OD|LIQUOR|DETOX|INTOX|LIQ", Final_Call_Type) ~ "Narcotics",
    grepl("HARBOR|ANIMAL|GAMBLING|WATER|TREES|NORAD|STADIUM|ILLEGAL DUMPING|SLEEPER|HAZ|BIAS|NUISANCE|URINATING|HOSPITAL|PHONE|CROWD|EVENT|DEMONSTRATIONS|DISTURBANCE|UNUSUAL|NOISE|POWER|LANDLINE|LITTERING", Final_Call_Type) ~ "Civil incidents and security",
    grepl("DOA|SHOTS|CASUALTY|FELONY|SUSPICIOUS|ESCAPE|FIRE|PURSUIT|SWAT|SHOOTING|SUICIDE|HOSTAGE|HOMICIDE", Final_Call_Type) ~ "Emergency and Critical incidents",
    grepl("ROBBERY|BURGLARY|PROPERTY|THEFT|BREAKING|SHOPLIFT|ARSON|TRESPASS|BURG|BURN|EXPLOSION|FRAUD", Final_Call_Type) ~ "Property Crimes",
    grepl("ALARM|ORDER|INSPECTION|WATCH", Final_Call_Type) ~ "Alarm and Security",
    grepl("ASSIST|CHECK|HELP|ASSIGNED|PATROL", Final_Call_Type) ~ "Assistance and Checks",
    grepl("DOMESTIC|ABUSE|CUSTODIAL|ARGUMENTS|DV", Final_Call_Type) ~ "Domestic Violence",
    grepl("Traffic|VIOLATIONS|ACCIDENT|MVC|CAR|DUI|TRAF|ROAD|VEHICLE|ACC|HIT AND RUN", Final_Call_Type) ~ "Traffic Incident",
    grepl("MISSING|AWOL|FOUND|RUNAWAY|ABDUCTION|KIDNAP|CHILD|JUVENILE|LOST|AMBER|A.W.O.L.", Final_Call_Type) ~ "Missing Persons",
    grepl("OBS", Final_Call_Type) ~ "Observation",
    grepl("CANCELLED|NO ANSWER|OUT AT RANGE", Final_Call_Type) ~ "No action",
    TRUE ~ "Other"
  ))
# Share of events whose manual category changed between the initial and final
# classification. Computed in Spark; ggplot collects the two-row aggregate.
same_diff_counts <- df_clean_ll %>%
group_by(Same_Category = ifelse(Initial_Category == Final_Category, "Same", "Different")) %>%
summarise(Count = n())
# Bar chart for displaying values
ggplot(same_diff_counts, aes(x = Same_Category, y = Count, fill = Same_Category)) +
geom_bar(stat = "identity") +
labs(title = "Comparison of Initial and Final Categories",
x = "Category",
y = "Number of Occurrences",
fill = "Category")
rm(same_diff_counts)
gc()
##            used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells  3796464 202.8    7693420  410.9   7693420  410.9
## Vcells 44124820 336.7  129133291  985.3 160985511 1228.3
# Count the derived initial categories in Spark; ggplot materialises the
# small aggregate when drawing.
initial_category_counts <- df_clean_ll %>%
  count(Initial_Category, name = "Count")
# Bars ordered by descending count; rotated tick labels for readability.
ggplot(initial_category_counts, aes(x = reorder(Initial_Category, -Count), y = Count, fill = Initial_Category)) +
  geom_col() +
  labs(
    title = "Distribution of Initial Categorizations",
    x = "Initial Category",
    y = "Number of Occurrences"
  ) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
rm(initial_category_counts)
gc()
##            used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells  3802986 203.2    7693420  410.9   7693420  410.9
## Vcells 44140247 336.8  129133291  985.3 160985511 1228.3
# Count the derived final categories in Spark; ggplot materialises the
# small aggregate when drawing.
final_category_counts <- df_clean_ll %>%
  count(Final_Category, name = "Count")
# Bars ordered by descending count; rotated tick labels for readability.
ggplot(final_category_counts, aes(x = reorder(Final_Category, -Count), y = Count, fill = Final_Category)) +
  geom_col() +
  labs(
    title = "Distribution of Final Categorizations",
    x = "Final Category",
    y = "Number of Occurrences"
  ) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
rm(final_category_counts)
gc()
##            used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells  3802275 203.1    7693420  410.9   7693420  410.9
## Vcells 44138603 336.8  129133291  985.3 160985511 1228.3
# Scatter of a coordinate sample colored by final category.
location_final_sample <- df_clean_ll %>%
select(Blurred_Longitude, Blurred_Latitude, Final_Category) %>%
sdf_sample(fraction = 0.5, replacement = FALSE) %>% # Sample 50% of data (the previous "1%" comment did not match the code)
collect()
ggplot(location_final_sample, aes(x = Blurred_Longitude, y = Blurred_Latitude, color = Final_Category)) +
geom_point(alpha = 0.6) +
labs(title = "Visualization of the Relationship Between Final Category and Call Location",
x = "Longitude",
y = "Latitude",
color = "Final Category") +
theme_minimal() +
theme(legend.position = "bottom")
rm(location_final_sample)
gc()
##            used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells  3817608 203.9    7693420  410.9   7693420  410.9
## Vcells 88363790 674.2  186127938 1420.1 160985511 1228.3
# Scatter of a coordinate sample colored by sector.
location_final_sample <- df_clean_ll %>%
  select(Blurred_Longitude, Blurred_Latitude, Sector) %>%
  sdf_sample(fraction = 0.5, replacement = FALSE) %>% # Sample 50% of data (comment previously said 1%)
  collect()
# FIX: axis labels were swapped -- x maps to Blurred_Longitude and y to
# Blurred_Latitude, but they were labelled "Latitude"/"Longitude".
ggplot(location_final_sample, aes(x = Blurred_Longitude, y = Blurred_Latitude, color = Sector)) +
  geom_point(alpha = 0.6) +
  labs(title = "Visualization of the Relationship Between Sector and Call Location",
       x = "Longitude",
       y = "Latitude",
       color = "Sector") +
  theme_minimal() +
  theme(legend.position = "bottom")
rm(location_final_sample)
gc()
##            used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells  3823571 204.3    7693420  410.9   7693420  410.9
## Vcells 88399077 674.5  187501544 1430.6 187501544 1430.6
# Scatter of a coordinate sample colored by police precinct.
location_final_sample <- df_clean_ll %>%
  select(Blurred_Longitude, Blurred_Latitude, Precinct) %>%
  sdf_sample(fraction = 0.5, replacement = FALSE) %>% # Sample 50% of data (comment previously said 1%)
  collect()
# FIX: axis labels were swapped -- x maps to Blurred_Longitude and y to
# Blurred_Latitude, but they were labelled "Latitude"/"Longitude".
ggplot(location_final_sample, aes(x = Blurred_Longitude, y = Blurred_Latitude, color = Precinct)) +
  geom_point(alpha = 0.6) +
  labs(title = "Visualization of the Relationship Between Police Precinct and Call Location",
       x = "Longitude",
       y = "Latitude",
       color = "Police Precinct") +
  theme_minimal() +
  theme(legend.position = "bottom")
rm(location_final_sample)
gc()
##            used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells  3815970 203.8    7693420  410.9   7693420  410.9
## Vcells 88366780 674.2  187501544 1430.6 187501544 1430.6
# (Reconstructed narrative -- prose was fused with the code during
# extraction:) a Response_Time feature is derived from Arrived_Time so the
# response can later be labeled fast/slow for binary classification. To pick
# the fast/slow threshold, a histogram of the response-time distribution is
# examined first.
df_times <- df_clean_ll %>%
  mutate(
    # unix_timestamp() yields seconds since epoch; divide by 60 for minutes.
    Response_Time = (unix_timestamp(Arrived_Time) - unix_timestamp(Original_Time_Queued)) / 60
  )
# Plot histogram of response times with 10-minute bins (the narrative above
# mentions five-minute intervals but binwidth = 10 -- confirm which is
# intended). Times are truncated to [0, 1000] minutes to drop outliers.
df_times_filtered <- df_times %>% filter(Response_Time >= 0 & Response_Time <= 1000)
# A 10% sample is collected locally for plotting.
response_time_sample <- df_times_filtered %>%
select(Response_Time) %>%
sdf_sample(fraction = 0.1, replacement = FALSE) %>%
collect()
ggplot(response_time_sample, aes(x = Response_Time)) +
geom_histogram(binwidth = 10, fill = "skyblue", color = "black") +
labs(title = "Distribution of Response Time",
x = "Response Time (minutes)",
y = "Frequency")
rm(response_time_sample)
gc()
##            used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells  3797724 202.9    7693420  410.9   7693420  410.9
## Vcells 44679634 340.9  150001236 1144.5 187501544 1430.6
# (Reconstructed narrative -- prose was fused with the code during
# extraction:) a binary Response_Speed feature is assigned from the median
# Response_Time: instances at or below the median get 1 (fast response),
# the rest get 0 (slow response).
# Calculate the median in Spark without collecting all data.
median_result <- df_times_filtered %>%
  summarise(median_time = percentile_approx(Response_Time, 0.5)) %>%
  collect()
median_value <- median_result$median_time
# Create the Response_Speed label (1 = fast, 0 = slow).
df_prepared <- df_times_filtered %>%
  mutate(Response_Speed = if_else(Response_Time <= median_value, 1, 0))
# Class balance of the new label (should be ~50/50 by construction).
response_speeds <- df_prepared %>%
  group_by(Response_Speed) %>%
  summarise(Count = n()) %>%
  collect()
# FIX: Response_Speed is numeric 0/1; map it through as.factor() so the bar
# chart gets a discrete axis and legend rather than a continuous gradient.
ggplot(response_speeds, aes(x = as.factor(Response_Speed), y = Count, fill = as.factor(Response_Speed))) +
  geom_bar(stat = "identity") +
  labs(title = "Distribution of Response Speed",
       x = "Response Speed",
       y = "Number of Occurrences",
       fill = "Response Speed")
rm(response_speeds)
gc()
##            used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells  3811497 203.6    7693420  410.9   7693420  410.9
## Vcells 44150913 336.9  150001236 1144.5 187501544 1430.6
# Collect a 5% sample of the modelling columns for exploratory plots.
viz_sample <- df_prepared %>%
  select(Priority, Initial_Category, Blurred_Longitude, Blurred_Latitude,
         Sector, Response_Time, Response_Speed) %>%
  sdf_sample(fraction = 0.05, replacement = FALSE) %>%
  collect()
ggplot(viz_sample, aes(x = Priority, y = Response_Time)) +
  geom_boxplot() +
  labs(title = "Response Time vs. Priority", x = "Priority", y = "Response Time")
ggplot(viz_sample, aes(x = Initial_Category, y = Response_Time)) +
  geom_boxplot() +
  labs(title = "Response Time vs. Initial Category", x = "Initial Category", y = "Response Time") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# FIX: the color legend was mislabeled "Response Time" although it maps the
# binary Response_Speed; also coerce the 0/1 value to a factor so the legend
# is discrete instead of a continuous gradient.
ggplot(viz_sample, aes(x = Blurred_Longitude, y = Blurred_Latitude, color = as.factor(Response_Speed))) +
  geom_point() +
  labs(title = "Response Speed vs. Location", x = "Longitude", y = "Latitude", color = "Response Speed")
ggplot(viz_sample, aes(x = Sector, y = Response_Time)) +
  geom_boxplot() +
  labs(title = "Response Time vs. Sector", x = "Sector", y = "Response Time") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
rm(viz_sample)
gc()
##            used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells  3878254 207.2    7693420  410.9   7693420  410.9
## Vcells 52871890 403.4  150001236 1144.5 187501544 1430.6
# (Reconstructed narrative -- prose was fused with the code during
# extraction:) categories such as Domestic Violence and Emergency and
# Critical Incidents also tend to get a quick response. The modelling
# columns kept are Response_Time, Sector, Precinct, Initial_Category,
# the blurred coordinates and Priority; 80/20 train/test split, fixed seed.
# NOTE(review): Response_Speed is defined directly from Response_Time, yet
# Response_Time is kept as a predictor -- this leaks the label and likely
# explains the near-perfect AUCs below. Consider dropping Response_Time
# from the feature set.
df_prepared_split <- df_prepared %>%
  select(Response_Time, Response_Speed, Sector, Precinct, Initial_Category, Blurred_Longitude, Blurred_Latitude, Priority) %>%
  sdf_random_split(training = 0.8,
                   test = 0.2,
                   seed = 100)
train_data <- df_prepared_split$training
test_data <- df_prepared_split$test
# Logistic Regression
# Pipeline: RFormula encodes the categorical predictors, then fits a
# logistic regression on the assembled features.
log_pipeline <- sc %>%
ml_pipeline() %>%
ft_r_formula(Response_Speed ~ .) %>%
ml_logistic_regression()
# Hyperparameter grid over the regularization strength.
param_grid <- list(
logistic_regression = list(reg_param = c(0.01, 0.1, 1))
)
# Evaluate candidates by area under the ROC curve.
lr_evaluator <- ml_binary_classification_evaluator(x = sc, metricName = "areaUnderROC")
# Cross-validation
logistic_cv <- ml_cross_validator(
x = sc,
estimator = log_pipeline,
estimator_param_maps = param_grid,
evaluator = lr_evaluator,
num_folds = 5
)
print(logistic_cv)
## CrossValidator (Estimator)
## <cross_validator__c1de1852_2a77_4cc7_8659_6d7dc7a93258>
## (Parameters -- Tuning)
## estimator: Pipeline
## <pipeline__19dc9532_368f_4f3b_8ed9_221f3df1163a>
## evaluator: BinaryClassificationEvaluator
## <binary_classification_evaluator__32d5f291_ea1a_4abb_bd15_e7bcb0d20f20>
## with metric areaUnderROC
## num_folds: 5
## [Tuned over 3 hyperparameter sets]
# Fit the cross-validator on the training split (5 folds x 3 parameter sets).
model_cv <- ml_fit(
x = logistic_cv,
dataset = df_prepared_split$training
)
# Mean areaUnderROC per regularization value.
cv_metrics <- ml_validation_metrics(model_cv)
print(cv_metrics)
##   areaUnderROC reg_param_1
## 1    0.9367412        0.01
## 2    0.8739719        0.10
## 3    0.8339177        1.00
# NOTE(review): geom_smooth() over only three points triggers the loess
# warnings shown below and adds no information; geom_line() alone suffices.
cv_metrics %>%
ggplot(aes(reg_param_1, areaUnderROC)) +
geom_line() +
geom_smooth()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : span too small. fewer data values than degrees of freedom.
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : pseudoinverse used at 0.00505
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : neighborhood radius 0.09495
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : There are other near singularities as well. 0.81893
## Warning in predLoess(object$y, object$x, newx = if (is.null(newdata)) object$x
## else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : span too small. fewer
## data values than degrees of freedom.
## Warning in predLoess(object$y, object$x, newx = if (is.null(newdata)) object$x
## else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : pseudoinverse used at
## 0.00505
## Warning in predLoess(object$y, object$x, newx = if (is.null(newdata)) object$x
## else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : neighborhood radius
## 0.09495
## Warning in predLoess(object$y, object$x, newx = if (is.null(newdata)) object$x
## else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : reciprocal condition
## number 0
## Warning in predLoess(object$y, object$x, newx = if (is.null(newdata)) object$x
## else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : There are other near
## singularities as well. 0.81893
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
# Based on the graph, the model with regularization parameter 0.01 is the
# best. (Translated from Serbian.)
lmodel <- ml_logistic_regression(
df_prepared_split$training,
Response_Speed ~ .,
reg_param = 0.01
)
# Test-set predictions scored with ml_metrics_binary() (ROC-AUC, PR-AUC).
lrmx <- lmodel %>%
ml_predict(df_prepared_split$test) %>%
ml_metrics_binary()
# Random Forest
# Same pipeline shape as the logistic model: RFormula + classifier.
rf_pipeline <- sc %>%
ml_pipeline() %>%
ft_r_formula(Response_Speed ~ .) %>%
ml_random_forest_classifier()
# Grid over the number of trees.
rf_grid <- list(
random_forest_classifier = list(
num_trees = c(3, 25, 50)
)
)
rf_evaluator = ml_binary_classification_evaluator(x = sc, metricName = "areaUnderROC")
# Cross-validation
rf_cv <- ml_cross_validator(
x = sc,
estimator = rf_pipeline,
estimator_param_maps = rf_grid,
evaluator = rf_evaluator,
num_folds = 5
)
model_rf_cv <- ml_fit(
x = rf_cv,
dataset = df_prepared_split$training
)
rf_cv_metrics <- ml_validation_metrics(model_rf_cv)
print(rf_cv_metrics)
##   areaUnderROC num_trees_1
## 1    0.9717074           3
## 2    0.9939046          25
## 3    0.9944274          50
# NOTE(review): geom_smooth() over three points only triggers the loess
# warnings below; geom_line() alone suffices.
rf_cv_metrics %>%
ggplot(aes(num_trees_1, areaUnderROC)) +
geom_line() +
geom_smooth()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : span too small. fewer data values than degrees of freedom.
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : pseudoinverse used at 2.765
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : neighborhood radius 22.235
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : There are other near singularities as well. 636.81
## Warning in predLoess(object$y, object$x, newx = if (is.null(newdata)) object$x
## else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : span too small. fewer
## data values than degrees of freedom.
## Warning in predLoess(object$y, object$x, newx = if (is.null(newdata)) object$x
## else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : pseudoinverse used at
## 2.765
## Warning in predLoess(object$y, object$x, newx = if (is.null(newdata)) object$x
## else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : neighborhood radius
## 22.235
## Warning in predLoess(object$y, object$x, newx = if (is.null(newdata)) object$x
## else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : reciprocal condition
## number 0
## Warning in predLoess(object$y, object$x, newx = if (is.null(newdata)) object$x
## else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : There are other near
## singularities as well. 636.81
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
# Selection of the Best Model
# Refit a single random forest with the best grid value (50 trees) and score
# it on the held-out test split.
rfmodel <- ml_random_forest_classifier(
df_prepared_split$training,
Response_Speed ~ .,
num_trees = 50
)
rfmx <- rfmodel %>%
ml_predict(df_prepared_split$test) %>%
ml_metrics_binary()
# Decision Tree: same pipeline shape (RFormula + classifier).
dt_pipeline <- sc %>%
ml_pipeline() %>%
ft_r_formula(Response_Speed ~ .) %>%
ml_decision_tree_classifier()
# Grid over the maximum tree depth.
dt_grid <- list(
decision_tree_classifier = list(
max_depth = c(3, 5, 10)
)
)
dt_evaluator <- ml_binary_classification_evaluator(x = sc, metricName = "areaUnderROC")
# Cross-validation
dt_cv <- ml_cross_validator(
x = sc,
estimator = dt_pipeline,
estimator_param_maps = dt_grid,
evaluator = dt_evaluator,
num_folds = 5
)
model_dt_cv <- ml_fit(
x = dt_cv,
dataset = df_prepared_split$training
)
dt_cv_metrics <- ml_validation_metrics(model_dt_cv)
print(dt_cv_metrics)
##   areaUnderROC max_depth_1
## 1    0.9904559           3
## 2    0.9994886           5
## 3    0.9994836          10
# NOTE(review): geom_smooth() over three points only triggers the loess
# warnings below; geom_line() alone suffices.
dt_cv_metrics %>%
ggplot(aes(max_depth_1, areaUnderROC)) +
geom_line() +
geom_smooth()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : span too small. fewer data values than degrees of freedom.
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : pseudoinverse used at 2.965
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : neighborhood radius 2.035
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : There are other near singularities as well. 25.351
## Warning in predLoess(object$y, object$x, newx = if (is.null(newdata)) object$x
## else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : span too small. fewer
## data values than degrees of freedom.
## Warning in predLoess(object$y, object$x, newx = if (is.null(newdata)) object$x
## else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : pseudoinverse used at
## 2.965
## Warning in predLoess(object$y, object$x, newx = if (is.null(newdata)) object$x
## else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : neighborhood radius
## 2.035
## Warning in predLoess(object$y, object$x, newx = if (is.null(newdata)) object$x
## else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : reciprocal condition
## number 0
## Warning in predLoess(object$y, object$x, newx = if (is.null(newdata)) object$x
## else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : There are other near
## singularities as well. 25.351
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
# Selection of the best model (translated from Serbian): max_depth = 5 had
# the top cross-validated areaUnderROC.
dtmodel <- ml_decision_tree_classifier(
df_prepared_split$training,
Response_Speed ~ .,
max_depth = 5
)
dtmx <- dtmodel %>%
ml_predict(df_prepared_split$test) %>%
ml_metrics_binary()
# Gather the test-set metrics for the three models into a single data frame.
# ml_metrics_binary() returns roc_auc as the first estimate and pr_auc as
# the second.
metrics_df <- data.frame(
  model = c("Logistic Regression", "Random Forest", "Decision Tree"),
  auc_roc = c(lrmx$.estimate[1], rfmx$.estimate[1], dtmx$.estimate[1]),
  pr_auc = c(lrmx$.estimate[2], rfmx$.estimate[2], dtmx$.estimate[2])
)
print(metrics_df)
##                 model   auc_roc    pr_auc
## 1 Logistic Regression 0.9366460 0.9257674
## 2       Random Forest 0.9942370 0.9962217
## 3       Decision Tree 0.9994909 0.9994986
# Bar charts comparing the two metrics across models.
ggplot(metrics_df, aes(x = model, y = auc_roc, fill = model)) +
  geom_col(position = "dodge") +
  labs(title = "ROC-AUC Comparison", y = "ROC-AUC") +
  theme_minimal()
ggplot(metrics_df, aes(x = model, y = pr_auc, fill = model)) +
  geom_col(position = "dodge") +
  labs(title = "PR-AUC Comparison", y = "PR-AUC") +
  theme_minimal()
# Columns used for the spatial clustering experiments.
dataset.clustering <- df_times %>%
  select(Blurred_Longitude, Blurred_Latitude, Final_Category, Precinct)
# Function that calculates the within-cluster sum of squares for k = 2..max_k.
# FIX: slots that are never computed (k = 1 here) were left at 0, which drew a
# misleading zero-cost point on the elbow plot; they are now NA. The loop
# index is also empty-safe when max_k < 2 (seq_len(1)[-1] is empty, whereas
# 2:max_k would count downwards).
calculate_wcss <- function(data, max_k) {
  wcss <- rep(NA_real_, max_k)
  for (kc in seq_len(max_k)[-1]) {
    model <- ml_bisecting_kmeans(data, ~Blurred_Longitude + Blurred_Latitude, k = kc, max_iter = 10)
    # NOTE(review): assumes the fitted sparklyr model exposes a summable
    # `cost` component -- confirm against the sparklyr/Spark version in use.
    wcss[kc] <- sum(model$cost)
  }
  return(wcss)
}
# Evaluate candidate cluster counts up to 20.
max_k <- 20
wcss_values <- calculate_wcss(dataset.clustering, max_k)
# NOTE(review): calculate_wcss() never computes k = 1, so the first WCSS
# entry is a placeholder -- the first meaningful point on the curve is k = 2.
wcss_df <- data.frame(
k = 1:max_k,
WCSS = wcss_values
)
# Elbow plot: look for the k where the WCSS decrease levels off.
ggplot(wcss_df, aes(x = k, y = WCSS)) +
geom_line(color = "blue") +
geom_point(color = "red") +
labs(title = "Elbow Method for Determining the Optimal Number of Clusters",
x = "Number of Clusters (k)",
y = "Within-Cluster Sum of Squares (WCSS)") +
theme_minimal() +
theme(text = element_text(size = 16))
# Final bisecting k-means models at k = 7 and k = 10 (fixed seed for
# reproducibility); ml_predict() attaches a `prediction` cluster id per row.
model.k7 <- ml_bisecting_kmeans(dataset.clustering, ~Blurred_Longitude + Blurred_Latitude, k = 7, seed = 1, max_iter = 10)
clusters.k7 <- ml_predict(model.k7, dataset.clustering)
model.k10 <- ml_bisecting_kmeans(dataset.clustering, ~Blurred_Longitude + Blurred_Latitude, k = 10, seed = 1, max_iter = 10)
clusters.k10 <- ml_predict(model.k10, dataset.clustering)
# Per-cluster variance of the blurred coordinates (computed Spark-side).
# FIX: Spark SQL aggregates always drop NULLs; passing na.rm = TRUE
# explicitly silences dbplyr's repeated warning (seen in the output above)
# without changing the results.
variance_within_clusters <- function(data) {
  data %>%
    group_by(prediction) %>%
    summarise(across(c(Blurred_Longitude, Blurred_Latitude),
                     ~ var(.x, na.rm = TRUE)))
}
# Per-cluster mean/sd/median of the blurred coordinates (computed Spark-side).
cluster_summary <- function(data) {
  data %>%
    group_by(prediction) %>%
    summarise(across(c(Blurred_Longitude, Blurred_Latitude),
                     list(mean = ~ mean(.x, na.rm = TRUE),
                          sd = ~ sd(.x, na.rm = TRUE),
                          median = ~ median(.x, na.rm = TRUE))))
}
# Cluster centers, sizes, within-cluster variance and summary stats for k = 7.
centers.k7 <- model.k7$centers
# table() of predicted cluster ids; note Spark predictions are 0-based.
sizes.k7 <- table(dplyr::pull(clusters.k7, prediction))
variance.k7 <- variance_within_clusters(clusters.k7)
summary.k7 <- cluster_summary(clusters.k7)
print(centers.k7)
##   Blurred_Longitude Blurred_Latitude
## 1         -122.3631         47.54180
## 2         -122.2873         47.54420
## 3         -122.3444         47.60907
## 4         -122.3154         47.60659
## 5         -122.3654         47.66635
## 6         -122.3076         47.66828
## 7         -122.3266         47.71304
print(sizes.k7)
##
##       0       1       2       3       4       5       6
##  554795  671762 1174157 1242294  759705  500271  623915
print(variance.k7)
## Warning: Missing values are always removed in SQL aggregation functions.
## Use `na.rm = TRUE` to silence this warning
## This warning is displayed once every 8 hours.
## # Source: SQL [?? x 3]
## # Database: spark_connection
## prediction Blurred_Longitude Blurred_Latitude
## <int> <dbl> <dbl>
## 1 4 0.000378 0.000378
## 2 1 0.000354 0.000399
## 3 5 0.000261 0.000142
## 4 6 0.000448 0.000162
## 5 3 0.000131 0.000170
## 6 0 0.000384 0.000313
## 7 2 0.000278 0.000169
print(summary.k7)
## # Source: SQL [?? x 7]
## # Database: spark_connection
## prediction Blurred_Longitude_mean Blurred_Longitude_sd Blurred_Longitude_med…¹
## <int> <dbl> <dbl> <dbl>
## 1 4 -122. 0.0194 -122.
## 2 1 -122. 0.0188 -122.
## 3 5 -122. 0.0162 -122.
## 4 6 -122. 0.0212 -122.
## 5 3 -122. 0.0114 -122.
## 6 0 -122. 0.0196 -122.
## 7 2 -122. 0.0167 -122.
## # ℹ abbreviated name: ¹Blurred_Longitude_median
## # ℹ 3 more variables: Blurred_Latitude_mean <dbl>, Blurred_Latitude_sd <dbl>,
## # Blurred_Latitude_median <dbl>
# Cluster centers, sizes, within-cluster variance and summary stats for k = 10.
centers.k10 <- model.k10$centers
# table() of predicted cluster ids; note Spark predictions are 0-based.
sizes.k10 <- table(dplyr::pull(clusters.k10, prediction))
variance.k10 <- variance_within_clusters(clusters.k10)
summary.k10 <- cluster_summary(clusters.k10)
print(centers.k10)
##    Blurred_Longitude Blurred_Latitude
## 1          -122.3631         47.54180
## 2          -122.2873         47.54420
## 3          -122.3909         47.57914
## 4          -122.3400         47.61188
## 5          -122.3151         47.59521
## 6          -122.3156         47.61687
## 7          -122.3770         47.67496
## 8          -122.3494         47.65050
## 9          -122.3076         47.66828
## 10         -122.3266         47.71304
print(sizes.k10)
##
##       0       1       2       3       4       5       6       7       8       9
##  554795  671762  100518 1073639  589620  652674  453997  305708  500271  623915
print(variance.k10)
## # Source: SQL [?? x 3]
## # Database: spark_connection
## prediction Blurred_Longitude Blurred_Latitude
## <int> <dbl> <dbl>
## 1 4 0.000134 0.0000646
## 2 8 0.000261 0.000142
## 3 1 0.000354 0.000399
## 4 5 0.000128 0.0000422
## 5 7 0.000180 0.000137
## 6 6 0.000210 0.000293
## 7 9 0.000448 0.000162
## 8 3 0.0000653 0.0000893
## 9 0 0.000384 0.000313
## 10 2 0.000177 0.0000405
print(summary.k10)
## # Source: SQL [?? x 7]
## # Database: spark_connection
## prediction Blurred_Longitude_mean Blurred_Longitude_sd Blurred_Longitude_me…¹
## <int> <dbl> <dbl> <dbl>
## 1 4 -122. 0.0116 -122.
## 2 8 -122. 0.0162 -122.
## 3 1 -122. 0.0188 -122.
## 4 5 -122. 0.0113 -122.
## 5 7 -122. 0.0134 -122.
## 6 6 -122. 0.0145 -122.
## 7 9 -122. 0.0212 -122.
## 8 3 -122. 0.00808 -122.
## 9 0 -122. 0.0196 -122.
## 10 2 -122. 0.0133 -122.
## # ℹ abbreviated name: ¹Blurred_Longitude_median
## # ℹ 3 more variables: Blurred_Latitude_mean <dbl>, Blurred_Latitude_sd <dbl>,
## # Blurred_Latitude_median <dbl>
# Scatter the sampled points colored by cluster id, with the cluster centers
# overlaid as black crosses (shape 4). Returns a ggplot object.
plot_clusters <- function(data, centers, title) {
  cluster_plot <- ggplot(data, aes(x = Blurred_Longitude, y = Blurred_Latitude)) +
    geom_point(aes(color = as.factor(prediction)), size = 2) +
    geom_point(data = centers, color = "black", size = 3, shape = 4) +
    labs(color = "Cluster", title = title, x = "Longitude", y = "Latitude") +
    theme_minimal() +
    theme(text = element_text(size = 16))
  cluster_plot
}
# 1% samples of the clustered Spark tables for plotting.
# NOTE(review): sample_frac() on a Spark table is lazy and unseeded, so every
# collect() of these objects re-draws a different sample -- confirm that is
# acceptable for the plots below.
sample_data.k7 <- clusters.k7 %>% sample_frac(0.01)
sample_data.k10 <- clusters.k10 %>% sample_frac(0.01)
# NOTE(review): these are cluster scatter plots, not density heatmaps,
# despite the titles.
heatmap3 <- plot_clusters(collect(sample_data.k7), centers.k7, "Density Heatmap for K=7")
heatmap4 <- plot_clusters(collect(sample_data.k10), centers.k10, "Density Heatmap for K=10")
ggsave("heatmap3.jpg", heatmap3)
## Saving 7 x 5 in image
ggsave("heatmap4.jpg", heatmap4)
## Saving 7 x 5 in image
# Embed the saved images in the rendered document.
include_graphics(c("heatmap3.jpg", "heatmap4.jpg"))
# Composition of each cluster by final category and by precinct.
# FIX: sample_data.k7/.k10 are lazy, unseeded Spark samples, so each
# collect() re-executed the sampling and the paired charts could show
# *different* samples; collect once per k and reuse the local data frame.
sample_local.k7 <- collect(sample_data.k7)
# NOTE(review): the Set3 brewer palette has only 12 colors; with more final
# categories than that, extra levels are left unfilled (ggplot warns).
ggplot(data = sample_local.k7, aes(x = as.factor(prediction))) +
  geom_bar(aes(fill = Final_Category), position = "fill") +
  labs(x = "Cluster", y = "Proportion", title = "Distribution of Final Categories within Clusters") +
  scale_fill_brewer(palette = "Set3") +
  theme_minimal() +
  theme(text = element_text(size = 16))
# Precinct and Cluster
ggplot(data = sample_local.k7, aes(x = as.factor(prediction))) +
  geom_bar(aes(fill = Precinct), position = "fill") +
  labs(x = "Cluster", y = "Proportion", title = "Distribution of Police Departments within Clusters") +
  scale_fill_brewer(palette = "Set3") +
  theme_minimal() +
  theme(text = element_text(size = 16))
sample_local.k10 <- collect(sample_data.k10)
ggplot(data = sample_local.k10, aes(x = as.factor(prediction))) +
  geom_bar(aes(fill = Final_Category), position = "fill") +
  labs(x = "Cluster", y = "Proportion", title = "Distribution of Final Categories within Clusters") +
  scale_fill_brewer(palette = "Set3") +
  theme_minimal() +
  theme(text = element_text(size = 16))
# Precinct and Cluster
ggplot(data = sample_local.k10, aes(x = as.factor(prediction))) +
  geom_bar(aes(fill = Precinct), position = "fill") +
  labs(x = "Cluster", y = "Proportion", title = "Distribution of Police Departments within Clusters") +
  scale_fill_brewer(palette = "Set3") +
  theme_minimal() +
  theme(text = element_text(size = 16))